TP 1 - Tweets sobre COVID-19: buscando patrones interesantes.
library("ggplot2")
library("readr")
library("dplyr")
library("highcharter")
Registered S3 methods overwritten by 'htmltools':
method from
print.html tools:rstudio
print.shiny.tag tools:rstudio
print.shiny.tag.list tools:rstudio
Registered S3 method overwritten by 'htmlwidgets':
method from
print.htmlwidget tools:rstudio
Registered S3 method overwritten by 'quantmod':
method from
as.zoo.data.frame zoo
Registered S3 method overwritten by 'data.table':
method from
print.data.table
Highcharts (www.highcharts.com) is a Highsoft software product which is
not free for commercial and Governmental use
library("treemap")
library("modeest")
library("GGally")
Registered S3 method overwritten by 'GGally':
method from
+.gg ggplot2
Attaching package: 㤼㸱GGally㤼㸲
The following object is masked from 㤼㸱package:dplyr㤼㸲:
nasa
library("tidyverse")
Registered S3 methods overwritten by 'dbplyr':
method from
print.tbl_lazy
print.tbl_sql
Registered S3 method overwritten by 'httr':
method from
print.response rmutil
[30m-- [1mAttaching packages[22m --------------------------------------- tidyverse 1.3.0 --[39m
[30m[32mv[30m [34mtibble [30m 3.0.1 [32mv[30m [34mstringr[30m 1.4.0
[32mv[30m [34mpurrr [30m 0.3.4 [32mv[30m [34mforcats[30m 0.5.0[39m
[30m-- [1mConflicts[22m ------------------------------------------ tidyverse_conflicts() --
[31mx[30m [34mtidyr[30m::[32mexpand()[30m masks [34mMatrix[30m::expand()
[31mx[30m [34mdplyr[30m::[32mfilter()[30m masks [34mstats[30m::filter()
[31mx[30m [34mdplyr[30m::[32mlag()[30m masks [34mstats[30m::lag()
[31mx[30m [34mtidyr[30m::[32mpack()[30m masks [34mMatrix[30m::pack()
[31mx[30m [34mdplyr[30m::[32mrecode()[30m masks [34marules[30m::recode()
[31mx[30m [34mtidyr[30m::[32munpack()[30m masks [34mMatrix[30m::unpack()[39m
library("hrbrthemes")
Registering Windows fonts with R
NOTE: Either Arial Narrow or Roboto Condensed fonts are required to use these themes.
Please use hrbrthemes::import_roboto_condensed() to install Roboto Condensed and
if Arial Narrow is not on your system, please see https://bit.ly/arialnarrow
library("tidyr")
library("VIM")
Loading required package: colorspace
Loading required package: grid
VIM is ready to use.
Suggestions and bug-reports can be submitted at: https://github.com/statistikat/VIM/issues
Attaching package: 㤼㸱VIM㤼㸲
The following object is masked from 㤼㸱package:datasets㤼㸲:
sleep
library("e1071")
Attaching package: 㤼㸱e1071㤼㸲
The following object is masked from 㤼㸱package:modeest㤼㸲:
skewness
library("mice")
Attaching package: 㤼㸱mice㤼㸲
The following objects are masked from 㤼㸱package:base㤼㸲:
cbind, rbind
library("mongolite")
library("SnowballC")
library("tm")
Loading required package: NLP
Attaching package: 㤼㸱NLP㤼㸲
The following object is masked from 㤼㸱package:ggplot2㤼㸲:
annotate
Attaching package: 㤼㸱tm㤼㸲
The following object is masked from 㤼㸱package:arules㤼㸲:
inspect
library("twitteR")
Attaching package: 㤼㸱twitteR㤼㸲
The following objects are masked from 㤼㸱package:dplyr㤼㸲:
id, location
library("syuzhet")
library("tidyverse")
library("lubridate")
Attaching package: 㤼㸱lubridate㤼㸲
The following objects are masked from 㤼㸱package:dplyr㤼㸲:
intersect, setdiff, union
The following objects are masked from 㤼㸱package:arules㤼㸲:
intersect, setdiff, union
The following objects are masked from 㤼㸱package:base㤼㸲:
date, intersect, setdiff, union
library("RColorBrewer")
library("infotheo"); # Discretize variable
Attaching package: 㤼㸱infotheo㤼㸲
The following object is masked from 㤼㸱package:arules㤼㸲:
discretize
# Connect to the raw COVID-19 tweets collection and list the fields it exposes.
tweets <- mongo(db = "DMUBA", collection = "tweets_mongo_covid19")
colnames(tweets$find())
[1] "user_id" "status_id" "created_at" "screen_name"
[5] "text" "source" "is_quote" "is_retweet"
[9] "favorite_count" "retweet_count" "quote_count" "reply_count"
[13] "hashtags" "symbols" "urls_url" "urls_t_co"
[17] "urls_expanded_url" "media_url" "media_t_co" "media_expanded_url"
[21] "media_type" "ext_media_url" "ext_media_t_co" "ext_media_expanded_url"
[25] "mentions_user_id" "mentions_screen_name" "lang" "quoted_created_at"
[29] "retweet_status_id" "retweet_text" "retweet_created_at" "retweet_source"
[33] "retweet_favorite_count" "retweet_retweet_count" "retweet_user_id" "retweet_screen_name"
[37] "retweet_name" "retweet_followers_count" "retweet_friends_count" "retweet_statuses_count"
[41] "retweet_verified" "geo_coords" "coords_coords" "bbox_coords"
[45] "status_url" "name" "location" "description"
[49] "protected" "followers_count" "friends_count" "listed_count"
[53] "statuses_count" "favourites_count" "account_created_at" "verified"
[57] "profile_banner_url" "profile_background_url" "profile_image_url" "retweet_location"
[61] "retweet_description" "quoted_status_id" "quoted_text" "quoted_source"
[65] "quoted_favorite_count" "quoted_retweet_count" "quoted_user_id" "quoted_screen_name"
[69] "quoted_name" "quoted_followers_count" "quoted_friends_count" "quoted_statuses_count"
[73] "quoted_location" "quoted_description" "quoted_verified" "url"
[77] "place_url" "place_name" "place_full_name" "place_type"
[81] "country" "country_code" "lat" "lng"
[85] "display_text_width" "reply_to_status_id" "reply_to_user_id" "reply_to_screen_name"
# Load the per-tweet type flags (is_retweet / is_quote) and print the
# heading for the counts that follow.
t <- mongo(collection = "tweet_type", db = "DMUBA")
tweets_types <- t$find()
cat("Cantidades de tweets por tipo \n\n")
Cantidades de tweets por tipo
# Total number of rows loaded from the tweet_type collection.
cat("\t* Tweets: ", NROW(tweets_types), "\n")
* Tweets: 28907
# Retweets that are not also quotes.
cat("\t* Solo RT: ", nrow(tweets_types[with(tweets_types, is_retweet & !is_quote), ]), "\n")
* Solo RT: 17870
# Quotes that are not also retweets.
cat("\t* Solo QT: ", nrow(tweets_types[with(tweets_types, is_quote & !is_retweet), ]), "\n")
* Solo QT: 1789
# Tweets flagged as both retweet and quote.
cat("\t* RT y QT: ", nrow(tweets_types[with(tweets_types, is_retweet & is_quote), ]), "\n")
* RT y QT: 3416
# Tweets that are neither retweets nor quotes.
cat("\t* TW originales: ", nrow(tweets_types[with(tweets_types, !(is_retweet | is_quote)), ]), "\n")
* TW originales: 5832
# Label every tweet with its type, derived from the two boolean flags.
# A vectorised ifelse() is used instead of `df[mask, ]$tipo <- "..."` because
# the masked form fails with "replacement has 1 row, data has 0" as soon as a
# category is empty. The combined label is spelled "QT y RT" (it used to read
# "RQ y RT") so it matches the label used for the `numericos` data frame.
tweets_types$tipo <- with(
  tweets_types,
  ifelse(
    is_retweet,
    ifelse(is_quote, "QT y RT", "Solo RT"),
    ifelse(is_quote, "Solo QT", "Original")
  )
)
# names = c('Solo RT', 'Solo QT', 'RT + QT', 'Original')
# cantidades = c(nrow(tweets_types[tweets_types$is_retweet & !tweets_types$is_quote,]),
# nrow(tweets_types[!tweets_types$is_retweet & tweets_types$is_quote,]),
# nrow(tweets_types[tweets_types$is_retweet & tweets_types$is_quote,]),
# nrow(tweets_types[!tweets_types$is_retweet & !tweets_types$is_quote,])
# )
#
# Frequency table of tweet types: one row per type with its count.
grafico_tipos <- data.frame(table(tweets_types$tipo))
names(grafico_tipos) <- c("Tipos", "Cantidad")
coul <- brewer.pal(5, "Set2")
# Sort heights and labels with the same permutation. Previously only the
# heights were sorted, so the labels (still in table order) were drawn under
# the wrong bars.
orden_tipos <- order(grafico_tipos$Cantidad, decreasing = TRUE)
barplot(
  height = grafico_tipos$Cantidad[orden_tipos],
  names.arg = as.character(grafico_tipos$Tipos[orden_tipos]),
  col = coul
)
# coul <- brewer.pal(5, "Set2")
# png(filename="tipo_tweet.png", width=1000, bg="white")
# Horizontal bar chart of the tweet count per type, ordered by count.
ggplot(
  grafico_tipos,
  aes(x = reorder(Tipos, Cantidad), y = Cantidad, fill = Tipos)
) +
  geom_col() +
  scale_fill_brewer(palette = "Set2") +
  labs(title = "", subtitle = "", caption = "", tag = "", x = "", y = "") +
  theme(
    plot.title = element_text(hjust = 0.5),
    axis.text = element_text(size = 12),
    axis.text.y = element_text(margin = margin(10, 10, 10, 10)),
    axis.title.x = element_text(margin = margin(t = 10, r = 10, b = 10, l = 10)),
    legend.text = element_text(size = 12)
  ) +
  coord_flip()
# dev.off()
# Load the full per-tweet statistics and derive three helper columns:
#   Tipo               - tweet type label built from is_retweet / is_quote
#   verificado         - verified flag of the account behind the content
#   verificado_grafico - "Si"/"No" label of that flag, used for plotting
# Everything is computed with vectorised ifelse() calls. The previous
# `df[mask, ]$col <- value` assignments fail when a category has no rows, or
# when the mask contains NA.
tweets <- mongo(db = "DMUBA", collection = "tweet_completo_estadisticas")
numericos <- tweets$find()

# Tipos
numericos$Tipo <- with(
  numericos,
  ifelse(
    is_retweet,
    ifelse(is_quote, "QT y RT", "Solo RT"),
    ifelse(is_quote, "Solo QT", "Original")
  )
)

# Retweets (quoted or not) take the retweeted account's flag, pure quotes take
# the quoted account's flag, and originals take the author's own flag.
numericos$verificado <- with(
  numericos,
  ifelse(is_retweet, retweet_verified, ifelse(is_quote, quoted_verified, verified))
)

# Rows whose flag is NA stay NA here instead of aborting the script.
numericos$verificado_grafico <- ifelse(numericos$verificado, "Si", "No")
# png(filename="tipo_x_tweet_grid2.png", width=1000, bg="white")
# Verified vs. non-verified counts, one square panel per tweet type.
ggplot(numericos, aes(x = verificado_grafico, fill = Tipo)) +
  geom_bar() +
  scale_fill_brewer(palette = "Set2") +
  labs(title = "", subtitle = "", caption = "", tag = "", x = "", y = "") +
  theme(
    plot.title = element_text(hjust = 0.5),
    axis.text = element_text(size = 10),
    axis.text.y = element_text(margin = margin(10, 10, 10, 10)),
    axis.title.x = element_text(margin = margin(t = 10, r = 10, b = 10, l = 10)),
    legend.text = element_text(size = 10),
    aspect.ratio = 1
  ) +
  facet_wrap(~ Tipo, nrow = 2)
# dev.off()
# Count of tweets by the `verified` flag, split into one panel per tweet type.
ggplot(tweets_types, aes(x = verified, fill = tipo)) +
  geom_bar() +
  facet_wrap(facets = ~ tipo, nrow = 2)
Hay más usuarios verificados en el contenido nuevo. A su vez, hay más verificados en el contenido citado. Eso sugiere que un usuario verificado crea contenido de mayor calidad (más difundido y novedoso).
En cambio, el usuario difusor y los retweets, si bien aumentan el alcance de los tweets, no aportan contenido de alta calidad.
# Load the tweet timestamps. The result must be called `tweets_fecha`: every
# later statement reads that name, while the original assignment wrote to
# `tweets_fechas`, which is never read in the visible code.
t <- mongo(db = "DMUBA", collection = "fechas")
tweets_fecha <- t$find()
# Keep the old name as an alias in case code outside this section uses it.
tweets_fechas <- tweets_fecha
summary(tweets_fecha)
fecha t tc
Min. :2020-04-24 23:52:38 Min. :2020-04-24 23:52:38 2020-05-02 01:44:00: 1049
1st Qu.:2020-05-02 01:46:45 1st Qu.:2020-05-02 01:46:45 2020-05-02 01:40:00: 1043
Median :2020-05-04 20:30:15 Median :2020-05-04 20:30:15 2020-05-02 01:45:00: 1041
Mean :2020-05-06 12:24:30 Mean :2020-05-06 12:24:30 2020-05-02 01:43:00: 1012
3rd Qu.:2020-05-10 12:02:52 3rd Qu.:2020-05-10 12:02:52 2020-05-02 01:41:00: 990
Max. :2020-05-15 18:24:13 Max. :2020-05-15 18:24:13 2020-05-02 01:48:00: 990
(Other) :22782
fecha_str.Length fecha_str.Class fecha_str.Mode
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
1 -none- character
[ reached getOption("max.print") -- omitted 28657 rows ]
En un primer intento de graficar, vemos que los datos están distribuidos de una forma particular. La primera pregunta es: ¿hay alguna fecha que presentó una cantidad anómala de datos?
El 2 de mayo lo es. Sin embargo, no fue un día en el que haya acontecido algo en particular: ni es feriado (1/5), ni fue día de anuncios (25/4).
# plot(tweets_fecha$fecha)
# barplot(table(as.Date(tweets_fecha$fecha)))
# Number of tweets per calendar day, then a histogram of those daily counts.
f <- data.frame(table(as.Date(tweets_fecha$fecha)))
# The number of bins is now explicit (30 is what geom_histogram() used by
# default) and the title states it. The old title claimed "Bin size = 3",
# which was never applied to the plot.
ggplot(data = f, aes(x = Freq)) +
  geom_histogram(bins = 30, fill = "#69b3a2", color = "#e9ecef", alpha = 0.9) +
  ggtitle("Cantidad de tweets por día (30 bins)") +
  theme_ipsum() +
  theme(
    plot.title = element_text(size = 15)
  )
Agrupando en fracciones menores, de 5 minutos, vemos que lo que aconteció fue una ventana de captura de datos desigual. Al reducir la ventana de tiempo, vemos que hay una distribución más uniforme. Igualmente sigue planteándose la pregunta: podríamos analizarlo de a minutos, o con diferencias porcentuales, para ver si realmente hay algo ahí.
# Bucket every tweet into a 5-minute window and plot the tweets per window.
tweets_fecha[["t"]] <- ymd_hms(tweets_fecha[["fecha"]])
tweets_fecha[["tc"]] <- cut(tweets_fecha[["t"]], breaks = "5 min")
cant_5_min <- tweets_fecha %>% count(tc)
barplot(height = cant_5_min$n, legend.text = cant_5_min$tc)
## Tweets per date
# Same bucketing with 1-minute windows, where the counts look more balanced.
# The result keeps the name `cant_5_min` because later statements read it.
tweets_fecha[["t"]] <- ymd_hms(tweets_fecha[["fecha"]])
tweets_fecha[["tc"]] <- cut(tweets_fecha[["t"]], breaks = "1 min")
cant_5_min <- tweets_fecha %>% count(tc)
barplot(height = cant_5_min$n)
La variable temporal parece ser arbitraria.
Algo a seguir investigando es la ventana temporal entre: * la fecha de creación del tweet y la fecha de creación del retweet * la fecha de creación del tweet y la fecha de creación del tweet citado (quoted)
library(ggplot2)
library(dplyr)
library("plotly")
Attaching package: 㤼㸱plotly㤼㸲
The following object is masked from 㤼㸱package:ggplot2㤼㸲:
last_plot
The following object is masked from 㤼㸱package:stats㤼㸲:
filter
The following object is masked from 㤼㸱package:graphics㤼㸲:
layout
library(hrbrthemes)
# tweets_fecha$fecha
# Text version of each tweet's time bin. as.character() on the factor gives a
# plain character vector; the previous lapply() stored a list column, which
# made summary() print one row per tweet.
tweets_fecha$fecha_str <- as.character(tweets_fecha$tc)
# Split each bin label ("YYYY-mm-dd HH:MM:SS") into its calendar date and its
# time of day. The unused helper `b` was dropped: it parsed the full label
# with "%H:%M:%S", which cannot match and produced only NA.
cant_5_min$fecha <- as.Date(as.character(cant_5_min$tc))
cant_5_min$hora <- format(
  strptime(as.character(cant_5_min$tc), format = "%Y-%m-%d %H:%M:%S"),
  format = "%H:%M:%S"
)
# Bar chart of tweets per time bin, one panel per capture date, written to
# tipo_x_tweet.png.
# `hora` is a zero-padded "HH:MM:SS" string, so its default ordering on a
# discrete axis is already chronological. The previous `reorder(hora, hora)`
# tried to average character values and raised one "argument is not numeric or
# logical: returning NA" warning per level without changing the order.
# NOTE(review): scale_fill_gradient2() keeps its default midpoint of 0, so the
# "red" end is never reached for positive counts -- confirm this is intended.
p <- cant_5_min %>%
  ggplot(aes(x = hora, y = n, fill = n)) +
  geom_col() +
  scale_fill_gradient2(low = "red", mid = "snow3", high = "darkgreen", space = "Lab") +
  labs(title = "", subtitle = "", caption = "", tag = "", x = "", y = "") +
  theme(
    axis.title.x = element_blank(),
    axis.text.x = element_blank(),
    axis.ticks.x = element_blank(),
    axis.title.y = element_blank(),
    axis.text.y = element_blank(),
    axis.ticks.y = element_blank()
  ) +
  facet_wrap(~ fecha, nrow = 4)
png(filename = "tipo_x_tweet.png", width = 1000, bg = "white")
p
dev.off()
null device
1
# Convert the static ggplot into an interactive plotly widget.
p <- plotly::ggplotly(p)
argument is not numeric or logical: returning NAargument is not numeric or logical: returning NAargument is not numeric or logical: returning NAargument is not numeric or logical: returning NAargument is not numeric or logical: returning NAargument is not numeric or logical: returning NAargument is not numeric or logical: returning NAargument is not numeric or logical: returning NAargument is not numeric or logical: returning NAargument is not numeric or logical: returning NAargument is not numeric or logical: returning NAargument is not numeric or logical: returning NAargument is not numeric or logical: returning NAargument is not numeric or logical: returning NAargument is not numeric or logical: returning NAargument is not numeric or logical: returning NAargument is not numeric or logical: returning NAargument is not numeric or logical: returning NAargument is not numeric or logical: returning NAargument is not numeric or logical: returning NAargument is not numeric or logical: returning NAargument is not numeric or logical: returning NAargument is not numeric or logical: returning NAargument is not numeric or logical: returning NAargument is not numeric or logical: returning NAargument is not numeric or logical: returning NAargument is not numeric or logical: returning NAargument is not numeric or logical: returning NAargument is not numeric or logical: returning NAargument is not numeric or logical: returning NAargument is not numeric or logical: returning NAargument is not numeric or logical: returning NAargument is not numeric or logical: returning NAargument is not numeric or logical: returning NAargument is not numeric or logical: returning NAargument is not numeric or logical: returning NAargument is not numeric or logical: returning NAargument is not numeric or logical: returning NAargument is not numeric or logical: returning NAargument is not numeric or logical: returning NAargument is not numeric or logical: returning NAargument is not numeric or 
logical: returning NAargument is not numeric or logical: returning NAargument is not numeric or logical: returning NAargument is not numeric or logical: returning NAargument is not numeric or logical: returning NAargument is not numeric or logical: returning NAargument is not numeric or logical: returning NAargument is not numeric or logical: returning NAargument is not numeric or logical: returning NAargument is not numeric or logical: returning NAargument is not numeric or logical: returning NAargument is not numeric or logical: returning NAargument is not numeric or logical: returning NAargument is not numeric or logical: returning NAargument is not numeric or logical: returning NAargument is not numeric or logical: returning NAargument is not numeric or logical: returning NAargument is not numeric or logical: returning NAargument is not numeric or logical: returning NAargument is not numeric or logical: returning NAargument is not numeric or logical: returning NAargument is not numeric or logical: returning NAargument is not numeric or logical: returning NAargument is not numeric or logical: returning NAargument is not numeric or logical: returning NAargument is not numeric or logical: returning NAargument is not numeric or logical: returning NAargument is not numeric or logical: returning NAargument is not numeric or logical: returning NAargument is not numeric or logical: returning NAargument is not numeric or logical: returning NAargument is not numeric or logical: returning NAargument is not numeric or logical: returning NAargument is not numeric or logical: returning NAargument is not numeric or logical: returning NAargument is not numeric or logical: returning NAargument is not numeric or logical: returning NAargument is not numeric or logical: returning NAargument is not numeric or logical: returning NAargument is not numeric or logical: returning NAargument is not numeric or logical: returning NAargument is not numeric or logical: returning NAargument 
is not numeric or logical: returning NAargument is not numeric or logical: returning NAargument is not numeric or logical: returning NAargument is not numeric or logical: returning NAargument is not numeric or logical: returning NAargument is not numeric or logical: returning NAargument is not numeric or logical: returning NAargument is not numeric or logical: returning NAargument is not numeric or logical: returning NAargument is not numeric or logical: returning NAargument is not numeric or logical: returning NAargument is not numeric or logical: returning NAargument is not numeric or logical: returning NAargument is not numeric or logical: returning NAargument is not numeric or logical: returning NAargument is not numeric or logical: returning NAargument is not numeric or logical: returning NAargument is not numeric or logical: returning NAargument is not numeric or logical: returning NAargument is not numeric or logical: returning NAargument is not numeric or logical: returning NA
# Display the interactive chart (relies on top-level auto-printing).
p
# Fetch only the document id and the tweet text through an aggregation
# pipeline with a single $project stage.
tweets_text <- tweets$aggregate(pipeline = '[{
"$project": {
"_id": "$_id",
"text": "$text"
}
}
]')
# summary(tweets_text)
# Character count of the raw tweet text. This assignment used to be commented
# out, so the boxplot below failed with "need finite 'ylim' values" because
# the column did not exist. allowNA = TRUE keeps a malformed string from
# aborting the whole computation.
tweets_text$cantChars <- nchar(tweets_text$text, type = "chars", allowNA = TRUE)
# summary(tweets_text)
# Why is there a tweet of about 900 characters? Possibly URLs or similar.
boxplot(tweets_text$cantChars)
no non-missing arguments to min; returning Infno non-missing arguments to max; returning -InfError in plot.window(xlim = xlim, ylim = ylim, log = log, yaxs = pars$yaxs) :
need finite 'ylim' values
# Density of the character count of the raw tweet text.
ggplot(data = tweets_text, mapping = aes(x = cantChars)) +
  geom_density(fill = "#69b3a2", color = "#e9ecef", alpha = 0.8) +
  labs(title = "Cantidad de catacteres por tweet") +
  theme_ipsum()
# Work on a copy so the raw text stays available in `tweets_text`.
tweets_text.df2 <- tweets_text
# Remove URLs one token at a time. The previous pattern "http.*" also deleted
# all text after the first link, which understated the character counts; the
# separate "https.*" pass could never match after it.
tweets_text.df2$text <- gsub("http\\S*", "", tweets_text.df2$text)
# Remove hashtags and user mentions.
tweets_text.df2$text <- gsub("[#@]\\w+", "", tweets_text.df2$text)
tweets_text.df2$cantChars <- nchar(tweets_text.df2$text)
# Density of the character count after the first cleaning pass.
ggplot(data = tweets_text.df2, mapping = aes(x = cantChars)) +
  geom_density(fill = "#69b3a2", color = "#e9ecef", alpha = 0.8) +
  labs(title = "Cantidad de catacteres por tweet") +
  theme_ipsum()
# Strip punctuation, then any token containing a digit (plus trailing
# whitespace), and recompute the character count before plotting its density.
sin_puntuacion <- gsub("[[:punct:]]", "", tweets_text.df2$text)
tweets_text.df2$text <- gsub("\\w*[0-9]+\\w*\\s*", "", sin_puntuacion)
tweets_text.df2$cantChars <- nchar(tweets_text.df2$text)
ggplot(data = tweets_text.df2, mapping = aes(x = cantChars)) +
  geom_density(fill = "#69b3a2", color = "#e9ecef", alpha = 0.8) +
  labs(title = "Sin caracteres especiales y numeros") +
  theme_ipsum()
# Connect to the per-user statistics and load them before summarising.
# `info_user` used to be summarised here before anything in this section had
# assigned it, so a fresh top-to-bottom run would fail or summarise stale data.
user_estadisticas <- mongo(db = "DMUBA", collection = "user_estadisticas")
info_user <- user_estadisticas$find()
summary(info_user)
user_id screen_name name description followers_count friends_count
Length:25435 Length:25435 Length:25435 Length:25435 Min. : 0 Min. : 0
Class :character Class :character Class :character Class :character 1st Qu.: 92 1st Qu.: 184
Mode :character Mode :character Mode :character Mode :character Median : 313 Median : 441
Mean : 12581 Mean : 1242
3rd Qu.: 999 3rd Qu.: 1073
Max. :18609108 Max. :971277
listed_count statuses_count favourites_count account_created_at verified user_popularity
Min. : 0.00 Min. : 1 Min. : 0 Min. :2007-02-15 14:03:49 Min. :0.00001 Length:25435
1st Qu.: 0.00 1st Qu.: 2209 1st Qu.: 1116 1st Qu.:2011-05-13 00:01:30 1st Qu.:0.00001 Class :character
Median : 1.00 Median : 9386 Median : 5514 Median :2013-12-25 05:01:22 Median :0.00001 Mode :character
Mean : 51.04 Mean : 34748 Mean : 19285 Mean :2014-08-19 17:22:36 Mean :0.01444
3rd Qu.: 6.00 3rd Qu.: 31359 3rd Qu.: 19238 3rd Qu.:2018-04-02 03:49:06 3rd Qu.:0.00001
Max. :57770.00 Max. :7203370 Max. :1265094 Max. :2020-05-15 18:08:33 Max. :1.00000
# User base
info_user <- user_estadisticas$find()
# Columns 5 to 9 are the numeric count columns that get log-transformed.
columnas_conteo <- 5:9
# Natural log of the raw counts; users with a count of 0 become -Inf here.
data_log <- as.data.frame(lapply(info_user[columnas_conteo], log))
# Replace zeros by a small positive value so that log() stays finite.
# The replacement is limited to the count columns: comparing the whole data
# frame with 0 also rewrote FALSE values in logical columns such as `verified`.
info_user[columnas_conteo] <- lapply(
  info_user[columnas_conteo],
  function(columna) replace(columna, which(columna == 0), 0.00001)
)
data_log_1 <- as.data.frame(lapply(info_user[columnas_conteo], log))
cat("Cantidad de usuarios que han twitteado: ", nrow(info_user))
Cantidad de usuarios que han twitteado: 25435
# ggpairs(data_log)
# ggpairs(info_user[,1:5])
# Raw count columns.
boxplot(info_user[5:9])
# Log scale, computed before zeros were replaced.
boxplot(x = data_log)
# Log scale, computed after zeros were replaced by a small positive value.
boxplot(x = data_log_1)
#
# info_user$verificado <- ifelse(info_user$verified, "Verificados", "Sin verificar")
# info_user$verificado <- as.factor(info_user$verificado)
TODO: * Juntar usuarios finales y usuarios que fueron replicados * ¿Qué hace que un usuario sea más divulgado? ¿Hay alguna medida de relevancia de un usuario? Aquellos más populares (¿según qué criterio?), ¿de qué tipo son? ¿Instituciones, usuarios comunes, bots? ¿Qué tan activos son? ¿Influye eso? ¿Desde qué dispositivo lo hacen? ¿Qué tipo de texto crean? ¿Qué hashtags usan? ¿De qué regiones son? ¿Hay algo interesante ahí? ¿Hay predominio de algún país? ¿Hay países donde se usa más Twitter?
# Load per-user tweet-type counts and summarise the `is_none` column.
user_tweets_estadisticas <- mongo(collection = "user_tweets_estadisticas", db = "DMUBA")
# User base
info_user <- user_tweets_estadisticas$find()
summary(info_user[["is_none"]])
Min. 1st Qu. Median Mean 3rd Qu. Max.
1.000 1.000 1.000 1.137 1.000 31.000
# Replace zeros by a small positive value in the columns that are
# log-transformed (3 to 7), then take their natural log. The replacement used
# to compare the whole data frame with 0, which also touched columns that are
# never logged.
columnas_tipo <- 3:7
info_user[columnas_tipo] <- lapply(
  info_user[columnas_tipo],
  function(columna) replace(columna, which(columna == 0), 0.00001)
)
data_log_1 <- as.data.frame(lapply(info_user[columnas_tipo], log))
# Group plots: sorted log-scale values of each tweet-type count column, one
# plot per column. The axis label of each plot is the deparsed call itself.
plot(sort(data_log_1$is_rt))
plot(sort(data_log_1$is_only_rt))
plot(sort(data_log_1$is_only_qt))
plot(sort(data_log_1$is_none))
plot(sort(data_log_1$is_qt))
TODO: ¿Hacer binning con esto? ¿Alinear distintos grupos en cada categoría? ¿Solo clasificarlos?
# First rows of the users sorted by tweet count, largest first.
head(info_user[order(info_user[["count"]], decreasing = TRUE), ])
Curiosamente, los usuarios finales con más tweets son creadores. ¿Serán bots?
# Classify each user as "Creador" when original tweets outnumber amplified
# ones (pure retweets plus pure quotes), otherwise as "Difusor".
# The comparison previously added is_only_rt twice and ignored is_only_qt.
info_user$tipo <- ifelse(
  info_user$is_none > info_user$is_only_rt + info_user$is_only_qt,
  "Creador",
  "Difusor"
)
barplot(table(info_user$tipo))
# Counts per user type, written as a horizontal bar chart to a PNG file.
info_user_graf <- data.frame(table(info_user$tipo))
names(info_user_graf) <- c("Tipo_usuario", "Cantidad")
png(filename = "tipo_usuario_creacion.png", width = 1000, bg = "white")
ggplot(
  info_user_graf,
  aes(x = reorder(Tipo_usuario, Cantidad), y = Cantidad, fill = Tipo_usuario)
) +
  geom_col() +
  scale_fill_brewer(palette = "Set2") +
  labs(title = "", subtitle = "", caption = "", tag = "", x = "", y = "") +
  # The trailing comma that left an empty last argument in theme() was removed.
  theme(
    plot.title = element_text(hjust = 0.5),
    axis.text = element_text(size = 14),
    axis.text.y = element_text(margin = margin(10, 10, 10, 10)),
    axis.title.x = element_text(margin = margin(t = 10, r = 10, b = 10, l = 10)),
    legend.text = element_text(size = 14)
  ) +
  coord_flip()
dev.off()
png
2
# Reload the per-user statistics and build base-10 logs of the count columns.
user_estadisticas <- mongo(db = "DMUBA", collection = "user_estadisticas")
info_user <- user_estadisticas$find()
# Columns 5 to 9 are the numeric count columns that get log-transformed.
columnas_conteo <- 5:9
# log10 of the raw counts; a count of 0 becomes -Inf here.
data_log <- as.data.frame(lapply(info_user[columnas_conteo], log10))
# Make the counts strictly positive and complete before the second log:
# zeros become 0.00001 and missing values become 0.0001. This is limited to
# the count columns; applying it to the whole data frame also overwrote FALSE
# and NA entries in logical, character and date columns.
info_user[columnas_conteo] <- lapply(
  info_user[columnas_conteo],
  function(columna) {
    columna[which(columna == 0)] <- 0.00001
    columna[is.na(columna)] <- 0.0001
    columna
  }
)
data_log_1 <- as.data.frame(lapply(info_user[columnas_conteo], log10))
Correlaciones
# Pairwise scatterplots and correlations of the log10 count columns.
GGally::ggpairs(data = data_log_1)
There were 25 warnings (use warnings() to see them)
# Smooth a numeric vector by bin means and plot it against the original data.
#
# The same steps used to be copy-pasted four times, with comments that still
# mentioned "Sepal.Width" from an unrelated example.
#
# values:   numeric vector to discretise.
# etiqueta: name of the variable; used as the y-axis label and as the name of
#           the column that stores the original values.
# metodo:   discretisation method passed to infotheo::discretize()
#           ("equalfreq" or "equalwidth").
# n_bins:   number of bins.
#
# Returns, invisibly, a data frame with the bin id (X), the original values
# and the per-bin mean (suavizado).
graficar_suavizado <- function(values, etiqueta, metodo, n_bins) {
  # infotheo:: is spelled out because arules also exports a discretize().
  bins <- infotheo::discretize(values, metodo, n_bins)
  bins[[etiqueta]] <- values
  # Replace every observation by the mean of its bin.
  medias_por_bin <- tapply(values, bins$X, mean)
  bins$suavizado <- as.numeric(medias_por_bin[as.character(bins$X)])
  # Original values sorted ascending (red) against the sorted bin means (blue).
  plot(
    sort(values),
    type = "p", col = "red",
    ylab = etiqueta, xlab = "Observaciones",
    main = "Dato original vs suavizado"
  )
  points(sort(bins$suavizado), col = "blue")
  # Both series are drawn as points, so the legend uses point symbols.
  legend(
    "topleft",
    legend = c("Original", "Suavizado"),
    col = c("red", "blue"),
    pch = 1
  )
  invisible(bins)
}

# `bin_eq_freq` is reassigned after each call, as before, so it ends up
# holding the result of the last variable.
bin_eq_freq <- graficar_suavizado(data_log_1$followers_count, "followers_count", "equalfreq", 20)
bin_eq_freq <- graficar_suavizado(data_log_1$listed_count, "listed_count", "equalfreq", 20)
# no_na_data <- data_log_1[!is.na(data_log_1$statuses_count),]
bin_eq_freq <- graficar_suavizado(data_log_1$statuses_count, "statuses_count", "equalwidth", 5)
# no_na_data <- data_log_1[!is.na(data_log_1$favourites_count),]
bin_eq_freq <- graficar_suavizado(data_log_1$favourites_count, "favourites_count", "equalwidth", 10)
TODO:

* Dentro de los creadores, ¿alguno fue retuiteado? ¿Citado? ¿Cuál es el impacto de los creadores?
* Dentro de los difusores, ¿qué impacto tienen? ¿Qué relevancia tienen los creadores originales? ¿Cuántos tweets fueron amplificados más de una vez en el grupo de twitteros finales?
* ¿Es muy simplista esto? ¿Funciona? ¿Hay dispositivos privilegiados? ¿Usan software para publicaciones los creadores? ¿Los difusores?
* Entre los creadores, ¿hay verificados? ¿Hay alguna forma de evaluar la confiabilidad o la veracidad de lo que dicen?
* Entre los difusores, ¿hay fake news? ¿Hay difusión indiscriminada? ¿Hay relación entre algún par de usuarios? ¿Hay alguna persona que tiene más difusión que otra?
There were 22 warnings (use warnings() to see them)
# png(filename="location_porc_na.png", width=1000, bg="white")
# Horizontal bar chart of porcentaje_na per Atributo, with the bars ordered
# by porcentaje_na. All titles and axis labels are intentionally blank.
ggplot(
  df,
  aes(x = reorder(Atributo, porcentaje_na), y = porcentaje_na, fill = Atributo)
) +
  geom_col() +
  scale_fill_brewer(palette = "Set2") +
  labs(title = "", subtitle = "", caption = "", tag = "", x = "", y = "") +
  theme(
    plot.title = element_text(hjust = 0.5),
    axis.text = element_text(size = 14),
    axis.text.y = element_text(margin = margin(t = 10, r = 10, b = 10, l = 10)),
    axis.title.x = element_text(margin = margin(10, 10, 10, 10)),
    legend.text = element_text(size = 14),
    aspect.ratio = 1
  ) +
  # Flip the axes so the attribute names read horizontally
  coord_flip()
# dev.off()
# Classify every tweet author as "Medio" (news media), "Politica" (politics)
# or "Normal" according to keywords found in the profile description.
# NOTE(review): `t` shadows base::t(); the name is kept because later code
# reuses this connection object.
t <- mongo(collection = "tweets_lower", db = "DMUBA")
aux <- t$aggregate('[{"$project":{"_id": "$_id","user_id":"$user_id","screen_name":"$screen_name","text":"$description"}}]')

# Normalise the description text ----
aux$text <- tolower(aux$text)
# Strip only the URL token. The former "http.*" pattern erased everything
# from the first link to the end of the description (and made the separate
# "https.*" pass dead code), so keywords after a link were never seen.
aux$text <- gsub("https?[^[:space:]]*", "", aux$text)
# Hashtag removal is intentionally disabled; user mentions are removed
# aux$text <- gsub("#\\w+","",aux$text)
aux$text <- gsub("@\\w+", "", aux$text)
aux$text <- gsub("[[:punct:]]", "", aux$text)
# Drop any token that contains a digit
aux$text <- gsub("\\w*[0-9]+\\w*\\s*", "", aux$text)
aux$text <- gsub("[^[:alnum:][:blank:]?&/\\-]", "", aux$text)
aux$text <- iconv(aux$text, from = "UTF-8", to = "ASCII//TRANSLIT")

# Keyword flags ----
# The text was lower-cased above and matching is literal (fixed = TRUE), so
# the keywords are lower-cased too; otherwise 'Reuters ', "Partido Socialista"
# and "PSUV" could never match.
palabras_noticias <- tolower(c("noticia", "periodismo", "periodista", 'periodico', "news", 'journalist', "reportero", "programa de tv", 'television', 'Reuters ', 'elpaisamerica', 'productora', 'conductor', 'columnista', 'corresponsal', 'telesur'))
aux$is_news_related <- rep(FALSE, nrow(aux))
for (palabra in palabras_noticias) {
  aux$is_news_related <- aux$is_news_related |
    grepl(palabra, aux$text, fixed = TRUE)
}
palabras_politica <- tolower(c("politico", "senador", "diputado", "alcalde", "subsecretario", "secretario", "secretaria", "presidencia", "presidente", "ministerio", "ministro", "ministra", "público", "publico", "canciller", "Partido Socialista", "PSUV", "partido del pueblo", 'asamblea nacional'))
aux$is_politic_related <- rep(FALSE, nrow(aux))
for (palabra in palabras_politica) {
  aux$is_politic_related <- aux$is_politic_related |
    grepl(palabra, aux$text, fixed = TRUE)
}

# Assign the user type ----
# "Politica" is assigned last, so it wins when both flags are set.
# Indexing the column (instead of aux[mask, ]$col) also works when a mask
# selects no rows.
aux$tipo_user <- rep("Normal", nrow(aux))
aux$tipo_user[aux$is_news_related] <- "Medio"
aux$tipo_user[aux$is_politic_related] <- "Politica"
# Drop the helper columns
aux$is_news_related <- NULL
aux$is_politic_related <- NULL
aux$text <- NULL
# Show the rows classified as politics
aux[aux$tipo_user == "Politica", ]
# tweets <- merge(tweets, aux, by="tweet_id")
# One row per screen_name.
# NOTE(review): max() on strings is alphabetical ("Politica" > "Normal" >
# "Medio"), so a screen_name with both "Medio" and "Normal" rows is reported
# as "Normal" - confirm this is the intended priority.
aux %>% group_by(screen_name) %>% summarise(tipo = max(tipo_user))
# table(aux$tipo_user)
# Classify every retweeted author as "Medio" (news media), "Politica"
# (politics) or "Normal" according to keywords found in the retweeted
# user's profile description. Uses the `t` connection opened earlier.
aux <- t$aggregate('[{"$project":{"_id":"$_id","user_id":"$retweet_user_id","screen_name":"$retweet_screen_name","text":"$retweet_description"}}]')
# Keep only rows that have a retweeted screen name
aux <- aux[!is.na(aux$screen_name), ]

# Normalise the description text ----
aux$text <- tolower(aux$text)
# Strip only the URL token. The former "http.*" pattern erased everything
# from the first link to the end of the description (and made the separate
# "https.*" pass dead code), so keywords after a link were never seen.
aux$text <- gsub("https?[^[:space:]]*", "", aux$text)
# Hashtag removal is intentionally disabled; user mentions are removed
# aux$text <- gsub("#\\w+","",aux$text)
aux$text <- gsub("@\\w+", "", aux$text)
aux$text <- gsub("[[:punct:]]", "", aux$text)
# Drop any token that contains a digit
aux$text <- gsub("\\w*[0-9]+\\w*\\s*", "", aux$text)
aux$text <- gsub("[^[:alnum:][:blank:]?&/\\-]", "", aux$text)
aux$text <- iconv(aux$text, from = "UTF-8", to = "ASCII//TRANSLIT")

# Keyword flags ----
# The text was lower-cased above and matching is literal (fixed = TRUE), so
# the keywords are lower-cased too; otherwise 'Reuters ', "Partido Socialista"
# and "PSUV" could never match.
palabras_noticias <- tolower(c("noticia", "periodismo", "periodista", 'periodico', "news", 'journalist', "reportero", "programa de tv", 'television', 'Reuters ', 'elpaisamerica', 'productora', 'conductor', 'columnista', 'corresponsal', 'telesur'))
aux$is_news_related <- rep(FALSE, nrow(aux))
for (palabra in palabras_noticias) {
  aux$is_news_related <- aux$is_news_related |
    grepl(palabra, aux$text, fixed = TRUE)
}
palabras_politica <- tolower(c("politico", "senador", "diputado", "alcalde", "subsecretario", "secretario", "secretaria", "presidencia", "presidente", "ministerio", "ministro", "ministra", "público", "publico", "canciller", "Partido Socialista", "PSUV", "partido del pueblo", 'asamblea nacional'))
aux$is_politic_related <- rep(FALSE, nrow(aux))
for (palabra in palabras_politica) {
  aux$is_politic_related <- aux$is_politic_related |
    grepl(palabra, aux$text, fixed = TRUE)
}
# barplot(tweets$is_news_related)
# barplot(tweets$is_politic_related)

# Assign the user type ----
# "Politica" is assigned last, so it wins when both flags are set.
# Indexing the column (instead of aux[mask, ]$col) also works when a mask
# selects no rows.
aux$tipo_user <- rep("Normal", nrow(aux))
aux$tipo_user[aux$is_news_related] <- "Medio"
aux$tipo_user[aux$is_politic_related] <- "Politica"
# Drop the helper columns
aux$is_news_related <- NULL
aux$is_politic_related <- NULL
aux$text <- NULL
# Show the rows classified as politics
aux[aux$tipo_user == "Politica", ]
# tweets <- merge(tweets, aux, by="tweet_id")
# One row per screen_name.
# NOTE(review): max() on strings is alphabetical ("Politica" > "Normal" >
# "Medio"), so a screen_name with both "Medio" and "Normal" rows is reported
# as "Normal" - confirm this is the intended priority.
aux %>% group_by(screen_name) %>% summarise(tipo = max(tipo_user))
# table(aux$tipo_user)
# Print the column names of `user` and of `aux` so they can be compared.
# In the captured output below only the third column differs
# ("retweet_screen_name" in `user` vs "screen_name" in `aux`).
names(user)
[1] "_id" "user_id" "retweet_screen_name" "tipo_user"
# Column names of the classification table built above
names(aux)
[1] "_id" "user_id" "screen_name" "tipo_user"